library(xtable)
library(MASS)
library(plyr)
set.seed(16021985)


# number of subjects:
n <- 100

# Randomly uniform age between 20 and 80:								
age <-  runif(n, 20, 80)

# Time as a quadratic function of age plus noise:					
time <- 320 + 25*age - .3*age^2+ rnorm(n,0,80)	
data <- data.frame(n = 1:n,
		age = age,
		time = time)
		
print(xtable(head(data)))

plot(data$age, data$time, xlab="Age", ylab="Usage time")

M0 <- lm(time ~ 1, data=data)
M1 <- lm(time ~ 1+ age, data=data)
data$age2 <- data$age^2
M2 <- lm(time ~ 1 + age + age2, data=data)

print(xtable(summary(M1)))

print(xtable(anova(M0, M1, M2)))

round( AIC(M0) )
AIC(M1)
AIC(M2)

plot(data$age, data$time, xlab="Age", ylab="Usage time")
abline(M0, lty=1)
abline(M1, lty=2)
data <- data[order(data$age), ]
lines(data$age, predict(M2, data), lty = 3)

### Fitting poly:

M3 <- lm(time ~ poly(data$age, 30, raw=TRUE), data=data)

plot(data$age, data$time, xlab="Age", ylab="Usage time")
lines(data$age, predict(M3, data))

yhat.M2.in <- predict(M2, data)
pred.in.m2 <- sum((data$time - yhat.M2.in)^2) / n
yhat.M3.in <- predict(M3, data)
pred.in.m3 <- sum((data$time - yhat.M3.in)^2) / n

### SHOULD BE TRUE:
print(pred.in.m2 > pred.in.m3)

n <- 1000
age <- runif(n, 20, 80)				
time <- 320 + 25*age - .3*age^2+ rnorm(n,0,80)	
data2 <- data.frame(n = 1:n,
		age = age,
		time = time)
yhat.M2.out <- predict(M2, data=data2)
pred.out.m2 <- sum((data2$time - yhat.M2.out)^2) / n
yhat.M3.out <- predict(M3, data=data2)
pred.out.m3 <- sum((data2$time - yhat.M3.out)^2) / n

### SHOULD BE FALSE
print(pred.out.m2 > pred.out.m3)

M4 <- lm.ridge(time ~ poly(data$age, 30, raw=TRUE), data=data, lambda=.01)

coefsM3 <- as.numeric(coef(M3))[1:5]
coefsM4 <- as.numeric(coef(M4))[1:5]
compare <- data.frame(M3 = coefsM3, M4=coefsM4)
print(xtable(compare))

yhat.M4.out <- coefsM4[1] + coefsM4[2] * data2$age + coefsM4[3] * data2$age^2 + coefsM4[4] * data2$age^3 
sum((data2$time - yhat.M4.out)^2) / n

### SHOULD BE TRUE
print(pred.out.m4 < pred.out.m3)


### GLMs:

library(arm)
n <- 100							
age <-  runif(n, 20, 80)
# use a logit function to compute the probabilty of usage:					
prob.adopt <- invlogit(320 + 25*age - .3*age^2+ rnorm(n,0,80) - 700)
# simulate usage:
adopt <- rbinom(n, 1, prob.adopt)	
data3 <- data.frame(n = 1:n,
		age = age,
		adopt = adopt)

plot(invlogit, xlim=c(-10,10), ylim=c(0,1))

plot(data3$age, jitter(data3$adopt))

data3$age2 <- data3$age^2
GM <- glm(adopt ~ 1 + age + age2, data=data3, family=binomial(logit))
plot(data3$age, jitter(data3$adopt))
data3 <- data3[order(data3$age), ]
lines(data3$age, predict(GM, data3, type="response"))

### WHY THE FUCK DO THE PLOTS NOT WORK...?


### MIXED MODELS:

# Setup the independent variables
n <- 100							
age <-  runif(n, 20, 80)
countries <- 20
countrycode <- sample(1:countries, n, TRUE)
data <- data.frame("id"=1:n, "country"=countrycode, "age"=age)

# Setup the data generating model
intercepts <- rnorm(countries, 0, 100)	
data <- ddply(data, .(country), function(x, intercepts){
	y <- intercepts[x$country] + 320 + 25*x$age - .3*x$age^2+ rnorm(length(x$age),0,20)
	return(data.frame("id"=x$id, "country"=x$country, "age"= x$age, "time" = y))
}, intercepts=intercepts)

# Plot:
library(lattice)
lattice.options(default.theme = standard.theme(color = FALSE))
xyplot(time ~ age, data=data, groups=country)

# Different approaches
data$age2 <- data$age^2
m.pooled <- lm(time ~ age + age2, data)
print( xtable( summary(m.pooled) ))
m.unpooled <- lm(time ~ -1 + as.factor(country) + age + age2, data)
print( xtable( summary(m.unpooled) ))
intercepts + 320
sd(intercepts + 320)
sd(coef(m.unpooled)[1:10])
#library(lme4)
m.mixed <- lmer(time ~ age + age2 + (1 | country), data)
summary(m.mixed)
xtable( display(m.mixed) )




glmer()

# illustration
group <- rep(1:5, each=10)
x <- group*10 + runif(50,-10,10)
xbar <- rep(ddply(data.frame(group=group, x=x), .(group), function(x){return(mean(x$x))})$V1, each=10)
y <- 100 + -5*x + 10*xbar + rnorm(50,0,5)
xyplot(y ~ x, groups=group)

